In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats as scipy_stats
import plotly.graph_objects as go
In [7]:
# Read the raw e-commerce transactions into a DataFrame
ecommerce_df = pd.read_csv('ecommerce_customer_behavior_dataset_v2.csv')
n_rows, n_cols = ecommerce_df.shape
print('✓ Dataset loaded successfully')
print(f'Shape: {n_rows} rows × {n_cols} columns')
✓ Dataset loaded successfully Shape: 17049 rows × 18 columns
In [8]:
# List every column together with its pandas dtype, padded for alignment
print('Column Names and Data Types:')
print('=' * 50)
for column_name, column_dtype in zip(ecommerce_df.columns, ecommerce_df.dtypes):
    print(f'{column_name:30s} → {column_dtype}')
Column Names and Data Types: ================================================== Order_ID → object Customer_ID → object Date → object Age → int64 Gender → object City → object Product_Category → object Unit_Price → float64 Quantity → int64 Discount_Amount → float64 Total_Amount → float64 Payment_Method → object Device_Type → object Session_Duration_Minutes → int64 Pages_Viewed → int64 Is_Returning_Customer → bool Delivery_Time_Days → int64 Customer_Rating → int64
In [9]:
# Quick look at the first handful of records
separator = '=' * 50
print('First 5 Rows of Dataset:')
print(separator)
print(ecommerce_df.head(5))
First 5 Rows of Dataset:
==================================================
Order_ID Customer_ID Date Age Gender City \
0 ORD_000001-1 CUST_00001 2023-05-29 40 Male Ankara
1 ORD_000001-2 CUST_00001 2023-10-12 40 Male Ankara
2 ORD_000001-3 CUST_00001 2023-12-05 40 Male Ankara
3 ORD_000002-1 CUST_00002 2023-05-11 33 Male Istanbul
4 ORD_000002-2 CUST_00002 2023-06-16 33 Male Istanbul
Product_Category Unit_Price Quantity Discount_Amount Total_Amount \
0 Books 29.18 1 0.00 29.18
1 Home & Garden 644.40 1 138.05 506.35
2 Sports 332.82 5 0.00 1664.10
3 Food 69.30 5 71.05 275.45
4 Beauty 178.15 3 0.00 534.45
Payment_Method Device_Type Session_Duration_Minutes Pages_Viewed \
0 Digital Wallet Mobile 14 9
1 Credit Card Desktop 14 8
2 Credit Card Mobile 15 10
3 Digital Wallet Desktop 16 13
4 Credit Card Mobile 14 7
Is_Returning_Customer Delivery_Time_Days Customer_Rating
0 True 13 4
1 True 6 2
2 True 9 4
3 True 4 4
4 True 6 4
In [11]:
# Tally nulls per column and express them as a share of all rows
missing_counts = ecommerce_df.isnull().sum()
missing_percentages = (missing_counts / len(ecommerce_df) * 100).round(2)

# Assemble a tidy per-column summary, worst-affected columns first
missing_summary_df = (
    pd.DataFrame({
        'Column': missing_counts.index,
        'Missing_Count': missing_counts.values,
        'Missing_Percentage': missing_percentages.values,
    })
    .sort_values('Missing_Count', ascending=False)
)

affected_cols = missing_summary_df[missing_summary_df['Missing_Count'] > 0]
clean_col_total = len(missing_summary_df[missing_summary_df['Missing_Count'] == 0])
print(f"Missing Values Analysis:")
print(f"Total rows: {len(ecommerce_df)}")
print(f"\nColumns with missing values:")
print(affected_cols.to_string(index=False))
print(f"\nColumns with no missing values: {clean_col_total}")
Missing Values Analysis: Total rows: 17049 Columns with missing values: Empty DataFrame Columns: [Column, Missing_Count, Missing_Percentage] Index: [] Columns with no missing values: 18
In [12]:
# Visualize where missing values occur: per-column rates plus a row-level map
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
bar_ax, heat_ax = axes

# Left panel: horizontal bars, one per column, showing % missing
missing_pct_series = missing_summary_df.set_index('Column')['Missing_Percentage']
bar_positions = range(len(missing_pct_series))
bar_ax.barh(bar_positions, missing_pct_series.values, color='steelblue')
bar_ax.set_yticks(bar_positions)
bar_ax.set_yticklabels(missing_pct_series.index, fontsize=9)
bar_ax.set_xlabel('Missing Percentage (%)', fontsize=11)
bar_ax.set_title('Missing Data by Column (%)', fontsize=12, fontweight='bold')
bar_ax.grid(axis='x', alpha=0.3)

# Right panel: binary missing/present matrix over the first few hundred rows
sample_size = min(500, len(ecommerce_df))
missing_matrix = ecommerce_df.head(sample_size).isnull().astype(int)
heat_ax.imshow(missing_matrix.T, aspect='auto', cmap='RdYlGn_r', interpolation='nearest')
heat_ax.set_yticks(range(len(ecommerce_df.columns)))
heat_ax.set_yticklabels(ecommerce_df.columns, fontsize=9)
heat_ax.set_xlabel(f'Rows (first {sample_size})', fontsize=11)
heat_ax.set_title('Missing Data Pattern Heatmap', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()
print(f"Visualization shows missing data patterns across {len(ecommerce_df.columns)} columns")
Visualization shows missing data patterns across 18 columns
In [13]:
# Quantify overall completeness: what fraction of all cells hold a value?
total_cells = ecommerce_df.shape[0] * ecommerce_df.shape[1]
total_missing = missing_summary_df['Missing_Count'].sum()
completeness_rate = ((total_cells - total_missing) / total_cells * 100).round(2)

print(f"=== Data Completeness Summary ===")
print(f"Total cells: {total_cells:,}")
print(f"Missing cells: {total_missing:,}")
print(f"Complete cells: {(total_cells - total_missing):,}")
print(f"Completeness rate: {completeness_rate}%")

# Only print the "fully complete" banner when the data actually supports it;
# previously these two lines ran unconditionally, so the cell would claim a
# 100%-complete dataset even when missing values existed.
if total_missing == 0:
    print(f"\n✓ All {len(ecommerce_df.columns)} columns are 100% complete")
    print(f"✓ Dataset has no missing values")
else:
    incomplete_cols = missing_summary_df[missing_summary_df['Missing_Count'] > 0]
    print(f"\n⚠️ {len(incomplete_cols)} columns contain missing values")
=== Data Completeness Summary === Total cells: 306,882 Missing cells: 0 Complete cells: 306,882 Completeness rate: 100.0% ✓ All 18 columns are 100% complete ✓ Dataset has no missing values
In [15]:
# Pick out every numeric column for the downstream statistics work
numerical_cols = list(ecommerce_df.select_dtypes(include=[np.number]).columns)
print(f'✓ Identified {len(numerical_cols)} numerical columns:')
for numeric_col in numerical_cols:
    print(f' - {numeric_col}')
✓ Identified 9 numerical columns: - Age - Unit_Price - Quantity - Discount_Amount - Total_Amount - Session_Duration_Minutes - Pages_Viewed - Delivery_Time_Days - Customer_Rating
In [17]:
# Build one column of descriptive statistics per numerical feature.
# The original grew an empty DataFrame column-by-column via dict assignment
# (stats_summary[col] = {...}), which is fragile across pandas versions;
# constructing the frame in a single pass is the supported idiom.
def _describe_numeric(series):
    """Return a dict of summary statistics for one numeric pandas Series."""
    non_null = series.dropna()  # scipy's skew/kurtosis do not ignore NaN
    return {
        'count': series.count(),
        'missing': series.isna().sum(),
        'mean': series.mean(),
        'median': series.median(),
        'std': series.std(),
        'min': series.min(),
        'max': series.max(),
        'q1': series.quantile(0.25),
        'q3': series.quantile(0.75),
        'skewness': scipy_stats.skew(non_null),
        'kurtosis': scipy_stats.kurtosis(non_null),
    }

# Fix the row order explicitly so the table layout is deterministic
_stat_order = ['count', 'missing', 'mean', 'median', 'std', 'min', 'max',
               'q1', 'q3', 'skewness', 'kurtosis']
# Columns of stats_summary are the features; rows are the statistic names
stats_summary = pd.DataFrame(
    {col: _describe_numeric(ecommerce_df[col]) for col in numerical_cols}
).reindex(_stat_order)
# Transpose for better readability: one row per feature
stats_summary_df = stats_summary.T
print(f'✓ Computed comprehensive statistics for {len(numerical_cols)} columns')
✓ Computed comprehensive statistics for 9 columns
In [18]:
# Render the full statistics table between heavy rules
banner = '═══════════════════════════════════════════════════════════'
print(banner)
print('COMPREHENSIVE STATISTICAL SUMMARY')
print(banner)
print()
print(stats_summary_df.round(2).to_string())
print()
print(banner)
═══════════════════════════════════════════════════════════
COMPREHENSIVE STATISTICAL SUMMARY
═══════════════════════════════════════════════════════════
count missing mean median std min max q1 q3 skewness kurtosis
Age 17049.0 0.0 34.95 35.00 11.05 18.00 75.00 26.00 42.00 0.32 -0.40
Unit_Price 17049.0 0.0 447.90 174.68 722.32 5.05 7900.01 73.26 494.57 3.65 17.60
Quantity 17049.0 0.0 3.01 3.00 1.42 1.00 5.00 2.00 4.00 -0.01 -1.30
Discount_Amount 17049.0 0.0 69.79 0.00 240.70 0.00 6538.29 0.00 32.71 8.64 120.04
Total_Amount 17049.0 0.0 1277.44 455.85 2358.44 6.21 37852.05 172.97 1267.75 4.58 30.37
Session_Duration_Minutes 17049.0 0.0 14.54 15.00 2.93 4.00 26.00 13.00 17.00 -0.01 0.04
Pages_Viewed 17049.0 0.0 9.00 9.00 2.26 1.00 18.00 7.00 11.00 -0.03 -0.01
Delivery_Time_Days 17049.0 0.0 6.50 6.00 3.49 1.00 25.00 4.00 8.00 1.13 1.87
Customer_Rating 17049.0 0.0 3.90 4.00 1.13 1.00 5.00 3.00 5.00 -0.93 0.09
═══════════════════════════════════════════════════════════
In [20]:
# Scan each numeric feature for common data-quality red flags, collecting
# human-readable warnings in detection order (missing → skew → kurtosis →
# outliers → zero variance) so the report ordering is stable.
quality_issues = []

# Re-key the stats table by column name for direct .loc lookups
stats_with_names = stats_summary_df.copy()
stats_with_names.index = numerical_cols

for feature in numerical_cols:
    values = ecommerce_df[feature]
    row = stats_with_names.loc[feature]

    # Missing data
    missing_n = row['missing']
    if missing_n > 0:
        missing_pct = (missing_n / len(ecommerce_df)) * 100
        quality_issues.append(f"⚠️ {feature}: {int(missing_n)} missing values ({missing_pct:.2f}%)")

    # Strong asymmetry (|skewness| beyond ±2)
    if abs(row['skewness']) > 2:
        quality_issues.append(f"⚠️ {feature}: High skewness ({row['skewness']:.2f}) - distribution highly skewed")

    # Heavy tails (|kurtosis| beyond ±7)
    if abs(row['kurtosis']) > 7:
        quality_issues.append(f"⚠️ {feature}: Extreme kurtosis ({row['kurtosis']:.2f}) - heavy tails or outliers")

    # Tukey fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    iqr = row['q3'] - row['q1']
    low_fence = row['q1'] - 1.5 * iqr
    high_fence = row['q3'] + 1.5 * iqr
    outlier_n = ((values < low_fence) | (values > high_fence)).sum()
    if outlier_n > 0:
        outlier_pct = (outlier_n / len(ecommerce_df)) * 100
        quality_issues.append(f"⚠️ {feature}: {outlier_n} potential outliers ({outlier_pct:.2f}%) using IQR method")

    # Constant column
    if row['std'] == 0:
        quality_issues.append(f"⚠️ {feature}: Zero variance - all values are identical")

data_quality_report = quality_issues if quality_issues else ["✓ No major data quality issues detected"]
print(f'Data Quality Assessment: {len(quality_issues)} issues found')
print()
Data Quality Assessment: 13 issues found
In [21]:
# Print the collected quality warnings between heavy rules
banner = '═══════════════════════════════════════════════════════════'
print(banner)
print('DATA QUALITY ISSUES REPORT')
print(banner)
print()
for issue_line in data_quality_report:
    print(issue_line)
print()
print(banner)
═══════════════════════════════════════════════════════════ DATA QUALITY ISSUES REPORT ═══════════════════════════════════════════════════════════ ⚠️ Age: 50 potential outliers (0.29%) using IQR method ⚠️ Unit_Price: High skewness (3.65) - distribution highly skewed ⚠️ Unit_Price: Extreme kurtosis (17.60) - heavy tails or outliers ⚠️ Unit_Price: 1757 potential outliers (10.31%) using IQR method ⚠️ Discount_Amount: High skewness (8.64) - distribution highly skewed ⚠️ Discount_Amount: Extreme kurtosis (120.04) - heavy tails or outliers ⚠️ Discount_Amount: 2789 potential outliers (16.36%) using IQR method ⚠️ Total_Amount: High skewness (4.58) - distribution highly skewed ⚠️ Total_Amount: Extreme kurtosis (30.37) - heavy tails or outliers ⚠️ Total_Amount: 1943 potential outliers (11.40%) using IQR method ⚠️ Session_Duration_Minutes: 85 potential outliers (0.50%) using IQR method ⚠️ Pages_Viewed: 1 potential outliers (0.01%) using IQR method ⚠️ Delivery_Time_Days: 475 potential outliers (2.79%) using IQR method ═══════════════════════════════════════════════════════════
In [22]:
# Plot a histogram for every numerical feature on one shared grid.
# The grid height is derived from the feature count (the original hard-coded
# a 3×3 grid and an unused _n_cols variable, and its "KDE overlay" comment
# described a plot that was never drawn) and surplus panels are hidden, so
# this cell keeps working if columns are added or removed upstream.
_n_plots = len(numerical_cols)
_n_grid_cols = 3
_n_rows = (_n_plots + _n_grid_cols - 1) // _n_grid_cols  # ceiling division
dist_hist_fig, dist_hist_axes = plt.subplots(_n_rows, _n_grid_cols, figsize=(15, 4 * _n_rows))
dist_hist_axes = np.atleast_1d(dist_hist_axes).flatten()
for _idx, _col_name in enumerate(numerical_cols):
    _ax = dist_hist_axes[_idx]
    _data = ecommerce_df[_col_name].dropna()
    _ax.hist(_data, bins=30, alpha=0.7, color='steelblue', edgecolor='black')
    _ax.set_title(f'{_col_name}', fontsize=11, fontweight='bold')
    _ax.set_xlabel('')
    _ax.set_ylabel('Frequency', fontsize=9)
    _ax.grid(axis='y', alpha=0.3)
    _ax.tick_params(labelsize=8)
# Hide any grid panels beyond the last feature
for _ax in dist_hist_axes[_n_plots:]:
    _ax.set_visible(False)
dist_hist_fig.suptitle('Distribution Histograms - Numerical Features', fontsize=14, fontweight='bold', y=0.995)
dist_hist_fig.tight_layout()
plt.show()
print(f'Created histograms for {len(numerical_cols)} numerical features')
Created histograms for 9 numerical features
In [23]:
# Restrict to the numeric features and compute pairwise Pearson correlations
corr_data = ecommerce_df[numerical_cols].copy()
correlation_matrix = corr_data.corr(method='pearson')
print(f'Correlation matrix computed for {len(numerical_cols)} numerical variables')
print(f'Matrix shape: {correlation_matrix.shape}')
Correlation matrix computed for 9 numerical variables Matrix shape: (9, 9)
In [24]:
# Walk the upper triangle (skipping the diagonal of self-correlations) and
# keep every pair whose |r| exceeds 0.5.
strong_correlations = []
feature_names = list(correlation_matrix.columns)
for pos, first_feature in enumerate(feature_names):
    for second_feature in feature_names[pos + 1:]:
        r = correlation_matrix.loc[first_feature, second_feature]
        if abs(r) > 0.5:
            strong_correlations.append({
                'Variable 1': first_feature,
                'Variable 2': second_feature,
                'Correlation': r,
                'Strength': 'Very Strong' if abs(r) > 0.8 else 'Strong',
            })

# Rank the surviving pairs by correlation magnitude, strongest first
strong_corr_df = pd.DataFrame(strong_correlations)
if len(strong_corr_df) > 0:
    strong_corr_df = strong_corr_df.sort_values('Correlation', key=abs, ascending=False)
    print(f'Found {len(strong_corr_df)} strong correlations (|r| > 0.5)')
else:
    print('No strong correlations found (|r| > 0.5)')
Found 1 strong correlations (|r| > 0.5)
In [25]:
# Pretty-print each strong pair with its direction, strength and a reading
if len(strong_corr_df) > 0:
    divider = '=' * 70
    print(divider)
    print('STRONG CORRELATIONS IDENTIFIED (|r| > 0.5)')
    print(divider)
    print()
    for _, pair in strong_corr_df.iterrows():
        r_value = pair['Correlation']
        direction_icon = '📈' if r_value > 0 else '📉'
        print(f"{direction_icon} {pair['Variable 1']} ↔ {pair['Variable 2']}")
        print(f" Correlation: {r_value:.3f}")
        print(f" Strength: {pair['Strength']}")
        # Qualitative reading keyed off the same magnitude thresholds
        if abs(r_value) > 0.8:
            reading = 'Very strong relationship - variables move together'
        elif abs(r_value) > 0.5:
            reading = 'Strong relationship - notable correlation'
        print(f" Interpretation: {reading}")
        print()
    print(divider)
else:
    print('No strong correlations (|r| > 0.5) found between variables')
    print('This suggests most features are relatively independent')
====================================================================== STRONG CORRELATIONS IDENTIFIED (|r| > 0.5) ====================================================================== 📈 Unit_Price ↔ Total_Amount Correlation: 0.866 Strength: Very Strong Interpretation: Very strong relationship - variables move together ======================================================================
In [27]:
# Create interactive heatmap using plotly
# Renders the full pairwise Pearson correlation matrix as an interactive
# figure: cell color encodes the coefficient, the rounded value is printed
# inside each cell, and hovering reveals the exact pair and coefficient.
heatmap_fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.index,
    colorscale='RdBu_r',  # Red-Blue reversed (red for positive, blue for negative)
    zmid=0,  # Center colorscale at 0
    zmin=-1,  # pin the scale to the full [-1, 1] range so colors stay comparable
    zmax=1,
    text=np.round(correlation_matrix.values, 2),  # 2-decimal annotation per cell
    texttemplate='%{text}',
    textfont={"size": 10},
    colorbar=dict(
        title="Correlation<br>Coefficient",
        tickvals=[-1, -0.5, 0, 0.5, 1],
        ticktext=['-1.0<br>Perfect<br>Negative', '-0.5<br>Moderate<br>Negative',
                  '0.0<br>None', '0.5<br>Moderate<br>Positive', '1.0<br>Perfect<br>Positive']
    ),
    # <extra></extra> suppresses plotly's default trace-name box in the tooltip
    hovertemplate='%{y} vs %{x}<br>Correlation: %{z:.3f}<extra></extra>'
))
heatmap_fig.update_layout(
    title={
        'text': 'E-Commerce Feature Correlation Matrix<br><sub>Interactive Heatmap | Hover for Details</sub>',
        'x': 0.5,  # center the title horizontally
        'xanchor': 'center'
    },
    xaxis={'title': '', 'side': 'bottom'},
    yaxis={'title': '', 'autorange': 'reversed'},  # first feature at the top, matching matrix row order
    width=900,
    height=800,
    font=dict(size=11)
)
heatmap_fig.show()
print('Interactive correlation heatmap created')
Interactive correlation heatmap created
In [28]:
# Box plots for every numerical feature, to surface outliers visually.
# Grid height is derived from the feature count (the original hard-coded a
# 3×3 grid plus an unused _n_cols variable and an unused _bp binding) and
# surplus panels are hidden, so this cell tolerates schema changes upstream.
_n_plots = len(numerical_cols)
_n_grid_cols = 3
_n_rows = (_n_plots + _n_grid_cols - 1) // _n_grid_cols  # ceiling division
dist_box_fig, dist_box_axes = plt.subplots(_n_rows, _n_grid_cols, figsize=(15, 4 * _n_rows))
dist_box_axes = np.atleast_1d(dist_box_axes).flatten()
for _idx, _col_name in enumerate(numerical_cols):
    _ax = dist_box_axes[_idx]
    _data = ecommerce_df[_col_name].dropna()
    _ax.boxplot(_data, vert=True, patch_artist=True,
                boxprops=dict(facecolor='lightcoral', alpha=0.7),
                medianprops=dict(color='darkred', linewidth=2),
                flierprops=dict(marker='o', markerfacecolor='red', markersize=3, alpha=0.5))
    _ax.set_title(f'{_col_name}', fontsize=11, fontweight='bold')
    _ax.set_ylabel('Value', fontsize=9)
    _ax.grid(axis='y', alpha=0.3)
    _ax.tick_params(labelsize=8)
    _ax.set_xticklabels([])  # one box per panel; the x tick label adds nothing
# Hide any grid panels beyond the last feature
for _ax in dist_box_axes[_n_plots:]:
    _ax.set_visible(False)
dist_box_fig.suptitle('Box Plots - Numerical Features (Outlier Detection)', fontsize=14, fontweight='bold', y=0.995)
dist_box_fig.tight_layout()
plt.show()
print(f'Created box plots for {len(numerical_cols)} numerical features')
Created box plots for 9 numerical features
In [29]:
# Summarize the distribution analysis into a plain-text report: one section
# per numeric feature with a qualitative skewness label and an outlier flag
# cross-referenced from the quality scan.
distribution_insights = []
distribution_insights.append("📊 Distribution Analysis Summary")
distribution_insights.append("=" * 50)
distribution_insights.append("")

# Re-key the stats table by column name so .loc works per feature
_stats_indexed = stats_summary_df.copy()
_stats_indexed.index = numerical_cols

for _col_name in numerical_cols:
    _col_stats = _stats_indexed.loc[_col_name]
    _insights = []

    # Bucket the skewness value into a qualitative description
    _skew = _col_stats['skewness']
    if abs(_skew) < 0.5:
        _skew_desc = "approximately symmetric"
    elif _skew > 2:
        _skew_desc = "highly right-skewed"
    elif _skew > 0.5:
        _skew_desc = "moderately right-skewed"
    elif _skew < -2:
        _skew_desc = "highly left-skewed"
    else:
        _skew_desc = "moderately left-skewed"
    _insights.append(f" • Skewness: {_skew_desc} ({_skew:.2f})")

    # Match quality issues on their "⚠️ <column>:" prefix rather than a bare
    # substring test, which would misfire whenever one column name is
    # contained in another (e.g. a "Price" column vs "Unit_Price").
    _has_outliers = any(
        issue.startswith(f"⚠️ {_col_name}:") and "outliers" in issue.lower()
        for issue in quality_issues
    )
    if _has_outliers:
        _insights.append(f" • Contains outliers (see quality report)")

    distribution_insights.append(f"{_col_name}:")
    distribution_insights.extend(_insights)
    distribution_insights.append("")

distribution_report = "\n".join(distribution_insights)
print(distribution_report)
📊 Distribution Analysis Summary ================================================== Age: • Skewness: approximately symmetric (0.32) • Contains outliers (see quality report) Unit_Price: • Skewness: highly right-skewed (3.65) • Contains outliers (see quality report) Quantity: • Skewness: approximately symmetric (-0.01) Discount_Amount: • Skewness: highly right-skewed (8.64) • Contains outliers (see quality report) Total_Amount: • Skewness: highly right-skewed (4.58) • Contains outliers (see quality report) Session_Duration_Minutes: • Skewness: approximately symmetric (-0.01) • Contains outliers (see quality report) Pages_Viewed: • Skewness: approximately symmetric (-0.03) • Contains outliers (see quality report) Delivery_Time_Days: • Skewness: moderately right-skewed (1.13) • Contains outliers (see quality report) Customer_Rating: • Skewness: moderately left-skewed (-0.93)
In [30]:
# Categorical fields are those stored as object or bool dtypes
categorical_cols = ecommerce_df.select_dtypes(include=['object', 'bool']).columns.tolist()

# Drop identifier-like fields; keep only business-meaningful categories
id_cols = ['Order_ID', 'Customer_ID', 'Date']
key_categorical = [column for column in categorical_cols if column not in id_cols]

print(f'✓ Found {len(key_categorical)} key categorical variables:')
for column in key_categorical:
    print(f' • {column}')
✓ Found 6 key categorical variables: • Gender • City • Product_Category • Payment_Method • Device_Type • Is_Returning_Customer
In [31]:
# Per-variable cardinality plus the five most common values with their shares
print('Categorical Variables Summary:')
print('=' * 70)
for category_name in key_categorical:
    unique_count = ecommerce_df[category_name].nunique()
    print(f'\n{category_name}:')
    print(f' Unique values: {unique_count}')
    print(f' Top 5 values:')
    top_values = ecommerce_df[category_name].value_counts().head(5)
    for value_label, value_count in top_values.items():
        value_share = (value_count / len(ecommerce_df)) * 100
        print(f' • {value_label}: {value_count:,} ({value_share:.1f}%)')
Categorical Variables Summary:
======================================================================
Gender:
Unique values: 3
Top 5 values:
• Female: 8,613 (50.5%)
• Male: 8,176 (48.0%)
• Other: 260 (1.5%)
City:
Unique values: 10
Top 5 values:
• Istanbul: 4,402 (25.8%)
• Ankara: 2,422 (14.2%)
• Izmir: 2,072 (12.2%)
• Bursa: 1,721 (10.1%)
• Adana: 1,326 (7.8%)
Product_Category:
Unique values: 8
Top 5 values:
• Sports: 2,248 (13.2%)
• Beauty: 2,212 (13.0%)
• Books: 2,206 (12.9%)
• Food: 2,103 (12.3%)
• Toys: 2,090 (12.3%)
Payment_Method:
Unique values: 5
Top 5 values:
• Credit Card: 6,801 (39.9%)
• Debit Card: 4,321 (25.3%)
• Digital Wallet: 3,276 (19.2%)
• Bank Transfer: 1,763 (10.3%)
• Cash on Delivery: 888 (5.2%)
Device_Type:
Unique values: 3
Top 5 values:
• Mobile: 9,543 (56.0%)
• Desktop: 5,845 (34.3%)
• Tablet: 1,661 (9.7%)
Is_Returning_Customer:
Unique values: 2
Top 5 values:
• True: 15,039 (88.2%)
• False: 2,010 (11.8%)
In [32]:
# Bar chart of order counts by gender
gender_counts = ecommerce_df['Gender'].value_counts()
fig_gender, ax_gender = plt.subplots(figsize=(8, 5))
gender_bars = ax_gender.bar(gender_counts.index, gender_counts.values, color=['#FF6B9D', '#4A90E2', '#95E1D3'])
ax_gender.set_xlabel('Gender', fontsize=12, fontweight='bold')
ax_gender.set_ylabel('Count', fontsize=12, fontweight='bold')
ax_gender.set_title('Distribution of Gender', fontsize=14, fontweight='bold', pad=20)
ax_gender.grid(axis='y', alpha=0.3)
# Annotate each bar with its count
for bar in gender_bars:
    bar_top = bar.get_height()
    ax_gender.text(bar.get_x() + bar.get_width() / 2., bar_top,
                   f'{int(bar_top):,}', ha='center', va='bottom', fontsize=10)
plt.tight_layout()
plt.show()
print(f'✓ Gender distribution visualized')
✓ Gender distribution visualized
In [33]:
# Horizontal bars of product category frequencies, largest on top
product_counts = ecommerce_df['Product_Category'].value_counts()
fig_products, ax_products = plt.subplots(figsize=(10, 6))
ax_products.barh(product_counts.index, product_counts.values, color='#6C63FF')
ax_products.set_xlabel('Count', fontsize=12, fontweight='bold')
ax_products.set_ylabel('Product Category', fontsize=12, fontweight='bold')
ax_products.set_title('Distribution of Product Categories', fontsize=14, fontweight='bold', pad=20)
ax_products.grid(axis='x', alpha=0.3)
ax_products.invert_yaxis()
# Annotate each bar with its count
for row_idx, (category_name, category_count) in enumerate(product_counts.items()):
    ax_products.text(category_count, row_idx, f' {int(category_count):,}', va='center', fontsize=10)
plt.tight_layout()
plt.show()
print(f'✓ Product Category distribution visualized')
✓ Product Category distribution visualized
In [34]:
# Bar chart of payment method usage
payment_counts = ecommerce_df['Payment_Method'].value_counts()
fig_payment, ax_payment = plt.subplots(figsize=(10, 6))
payment_palette = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']
payment_bars = ax_payment.bar(payment_counts.index, payment_counts.values, color=payment_palette)
ax_payment.set_xlabel('Payment Method', fontsize=12, fontweight='bold')
ax_payment.set_ylabel('Count', fontsize=12, fontweight='bold')
ax_payment.set_title('Distribution of Payment Methods', fontsize=14, fontweight='bold', pad=20)
ax_payment.grid(axis='y', alpha=0.3)
ax_payment.tick_params(axis='x', rotation=15)
# Annotate each bar with its count
for bar in payment_bars:
    bar_top = bar.get_height()
    ax_payment.text(bar.get_x() + bar.get_width() / 2., bar_top,
                    f'{int(bar_top):,}', ha='center', va='bottom', fontsize=10)
plt.tight_layout()
plt.show()
print(f'✓ Payment Method distribution visualized')
✓ Payment Method distribution visualized
In [35]:
# Pie chart of orders by device type, with counts folded into the labels
device_counts = ecommerce_df['Device_Type'].value_counts()
fig_device, ax_device = plt.subplots(figsize=(8, 8))
device_palette = ['#FF9F43', '#5F27CD', '#00D2D3']
wedges, label_texts, pct_texts = ax_device.pie(device_counts.values, labels=device_counts.index,
                                               autopct='%1.1f%%', startangle=90, colors=device_palette,
                                               textprops={'fontsize': 11, 'weight': 'bold'})
ax_device.set_title('Distribution of Device Types', fontsize=14, fontweight='bold', pad=20)
# Rewrite each wedge label to include the absolute count
for slice_idx, (device_name, device_count) in enumerate(device_counts.items()):
    label_texts[slice_idx].set_text(f'{device_name}\n({device_count:,})')
plt.tight_layout()
plt.show()
print(f'✓ Device Type distribution visualized')
✓ Device Type distribution visualized
In [36]:
# Horizontal bars for the ten busiest cities, annotated with count and share
city_counts = ecommerce_df['City'].value_counts().head(10)
fig_cities, ax_cities = plt.subplots(figsize=(10, 6))
ax_cities.barh(city_counts.index, city_counts.values, color='#26A69A')
ax_cities.set_xlabel('Count', fontsize=12, fontweight='bold')
ax_cities.set_ylabel('City', fontsize=12, fontweight='bold')
ax_cities.set_title('Top 10 Cities by Order Count', fontsize=14, fontweight='bold', pad=20)
ax_cities.grid(axis='x', alpha=0.3)
ax_cities.invert_yaxis()
# Annotate each bar with its count and percentage of all orders
for row_idx, (city_name, city_count) in enumerate(city_counts.items()):
    city_share = (city_count / len(ecommerce_df)) * 100
    ax_cities.text(city_count, row_idx, f' {int(city_count):,} ({city_share:.1f}%)', va='center', fontsize=9)
plt.tight_layout()
plt.show()
print(f'✓ Top cities distribution visualized')
✓ Top cities distribution visualized
In [37]:
# Compare returning vs first-time customers
returning_counts = ecommerce_df['Is_Returning_Customer'].value_counts()
fig_returning, ax_returning = plt.subplots(figsize=(8, 5))
segment_values = [returning_counts[True], returning_counts[False]]
ax_returning.bar(['Returning', 'New'], segment_values, color=['#27AE60', '#E74C3C'])
ax_returning.set_xlabel('Customer Type', fontsize=12, fontweight='bold')
ax_returning.set_ylabel('Count', fontsize=12, fontweight='bold')
ax_returning.set_title('Returning vs New Customers', fontsize=14, fontweight='bold', pad=20)
ax_returning.grid(axis='y', alpha=0.3)
# Annotate each bar with its count and share of all orders
for bar_idx, segment_count in enumerate(segment_values):
    segment_share = (segment_count / len(ecommerce_df)) * 100
    ax_returning.text(bar_idx, segment_count, f'{int(segment_count):,}\n({segment_share:.1f}%)',
                      ha='center', va='bottom', fontsize=11, fontweight='bold')
plt.tight_layout()
plt.show()
print(f'✓ Returning customer distribution visualized')
✓ Returning customer distribution visualized